library(tidyverse)
## ── Attaching packages ──────────
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.3.4 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ───────────────────
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
library(stringr)
library(forcats)
library(viridis)
## Loading required package: viridisLite
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Import and clean the Instacart data.
# Read the zipped Instacart training CSV and pass it through janitor's
# clean_names() so later chunks can rely on the cleaned column names.
instacart <- clean_names(
  read_csv("./instacart_train_data.csv.zip")
)
## Warning in strptime(x, format, tz = tz): unknown timezone 'zone/tz/2017c.
## 1.0/zoneinfo/America/New_York'
## Parsed with column specification:
## cols(
## order_id = col_integer(),
## product_id = col_integer(),
## add_to_cart_order = col_integer(),
## reordered = col_integer(),
## user_id = col_integer(),
## eval_set = col_character(),
## order_number = col_integer(),
## order_dow = col_integer(),
## order_hour_of_day = col_integer(),
## days_since_prior_order = col_integer(),
## product_name = col_character(),
## aisle_id = col_integer(),
## department_id = col_integer(),
## aisle = col_character(),
## department = col_character()
## )
How many items were ordered from each department?
# Bar chart of the number of ordered items per department.
# count() yields one row per department with the tally in `n`; the department
# factor is re-levelled by that tally so the bars follow the item count, and
# the tally is exposed to the plot under the name `items`.
instacart %>%
  count(department) %>%
  transmute(
    department = fct_reorder(department, n),
    items = n
  ) %>%
  plot_ly(
    x = ~department,
    y = ~items,
    color = ~department,
    type = "bar"
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
What is the distribution of the order hour of the day for each department? It seems that most orders are placed between 8:00 and 18:00.
# Box plot of the order hour of day, one box per department.
# Only the two columns the plot needs are kept. The department factor is
# re-levelled by the IQR of order_hour_of_day, so the boxes appear in order of
# increasing IQR.
instacart %>%
  select(department, order_hour_of_day) %>%
  mutate(
    department = fct_reorder(department, order_hour_of_day, IQR)
  ) %>%
  plot_ly(
    y = ~order_hour_of_day,
    color = ~department,
    type = "box",
    colors = "Set2"
  )
Products with a high number of orders are naturally more likely to be reordered. However, there seems to be a ceiling effect.
# Per-product reorder counts: keep only rows flagged as reorders
# (reordered == 1), tally them by product_id, and store the tally in a column
# named `reordered`.
reordered <- instacart %>%
  filter(reordered == 1) %>%
  count(product_id) %>%
  select(product_id, reordered = n)
# Scatter plot of reorder ratio against total order count, one point per
# product.
#   ordered   - number of rows for the product (counted per product_id and
#               department)
#   reordered - number of those rows flagged as reorders, taken from the
#               `reordered` table built earlier in this file
#   reo_ratio - reordered / ordered
instacart %>%
  count(product_id, department) %>%
  rename(ordered = n) %>%
  # Natural join: the only column the two tables share is product_id.
  left_join(reordered) %>%
  mutate(
    # Products that were never reordered have no match in the join and come
    # back as NA; treat them as zero reorders.
    reordered = coalesce(reordered, 0L),
    reo_ratio = reordered / ordered,
    # Hover label. product_id is inserted once; it was previously passed
    # twice, which rendered the ID doubled (e.g. "123123").
    tlabel = str_c(
      "Product_ID: ", product_id,
      "\nDepartment: ", department
    )
  ) %>%
  plot_ly(
    x = ~ordered,
    y = ~reo_ratio,
    type = "scatter",
    mode = "markers",
    alpha = 0.5,
    text = ~tlabel,
    color = I("black")
  )
## Joining, by = "product_id"